Global Fishing Watch workflow: environmental predictors

Author

Théophile L. Mouton

Published

September 18, 2024

Combining AIS and Sentinel-1 SAR fishing effort data from Global Fishing Watch

Load R libraries

Code
# Load required libraries
library(tidyverse)
library(maps)
library(ggplot2)
library(data.table)
library(gridExtra)
library(raster)
library(sf)
library(viridis)
library(scales)
library(dplyr)
library(randomForest)
library(caret)
library(pdp)
library(knitr)
library(kableExtra)
library(future)
library(spdep)
library(ncf)
library(blockCV)
library(doParallel)
library(sp)

AIS data

Data from Kroodsma et al. (2018) Science, accessible at: Global Fishing Watch Data Download Portal.

The data are available up to the end of 2020; we used four full years (2017, 2018, 2019, 2020) to match the SAR data records.

Code
# Set the path to the 2017-2020 folder

#path <- "Data/AIS Fishing Effort 2017-2020"

# List all CSV files in the folder
#AIS_csv_files <- list.files(path = here::here(path), pattern = "\\.csv$", full.names = TRUE, recursive = TRUE)

# Read all CSV files and combine them into a single data frame
#AIS_fishing <- AIS_csv_files %>%
#  map_df(~read_csv(.))

load(here::here("Data","AIS_fishing.Rdata"))

# Aggregate fishing hours by latitude and longitude
aggregated_AIS_fishing <- AIS_fishing %>%
  group_by(cell_ll_lat, cell_ll_lon) %>%
  summarise(total_fishing_hours = sum(fishing_hours, na.rm = TRUE)) %>%
  ungroup() %>%
  # Remove any cells with zero or negative fishing hours
  filter(total_fishing_hours > 0)

# Function to standardize coordinates to 0.1 degree resolution
standardize_coords <- function(lon, lat) {
  list(
    lon_std = floor(lon * 10) / 10,
    lat_std = floor(lat * 10) / 10
  )
}
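# e.g. standardize_coords(-122.47, 37.81) returns lon_std = -122.5, lat_std = 37.8,
# i.e. coordinates snapped down to the lower-left corner of their 0.1-degree cell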

# Standardize and aggregate AIS data
AIS_data_std <- aggregated_AIS_fishing %>%
  mutate(coords = map2(cell_ll_lon, cell_ll_lat, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE), .groups = "drop")

# Create the world map
world_map <- map_data("world")

# Create the plot
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = AIS_data_std, 
            aes(x = lon_std, y = lat_std, fill = total_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

Sentinel-1 SAR data

Data from Paolo et al. (2024) Nature, accessible at: Global Fishing Watch SAR Vessel Detections.

The data are available from 2017 to the present; we used four full years (2017, 2018, 2019, 2020) to match the AIS records.

Code
# Set the path to the 2017-2020 folder
#path <- "Data/SAR Vessel detections 2017-2020"

# List all CSV files in the folder
#SAR_csv_files <- list.files(path = here::here(path), pattern = "\\.csv$", full.names = TRUE)

# Read all CSV files and combine them into a single data frame
#SAR_fishing <- SAR_csv_files %>%
#  map_df(~read_csv(.))

load(here::here("Data","SAR_fishing.Rdata"))

# Aggregate SAR detections by latitude and longitude,
# keeping only detections with high confidence that the vessel is fishing
aggregated_SAR_fishing <- SAR_fishing %>%
  filter(fishing_score >= 0.9) %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  group_by(lat_rounded, lon_rounded) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0)) %>%
  ungroup()

# Standardize and aggregate SAR data
SAR_data_std <- aggregated_SAR_fishing %>%
  mutate(coords = map2(lon_rounded, lat_rounded, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")

# Create the world map
world_map <- map_data("world")

# Create the plot
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = SAR_data_std, 
            aes(x = lon_std, y = lat_std, fill = total_presence_score)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Fishing vessel detections (2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

Compare AIS and SAR detection locations

Identifying grid cells with only AIS detections, only SAR detections, or both data types.

Code
# Merge the datasets
combined_data <- full_join(
  AIS_data_std,
  SAR_data_std,
  by = c("lon_std", "lat_std")
)

# Categorize each cell
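# After the full join, cells missing from one dataset carry NA in that dataset's
# columns; case_when() below treats NA as absence of that data type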
combined_data <- combined_data %>%
  mutate(category = case_when(
    total_fishing_hours > 0 & total_presence_score > 0 ~ "Both AIS and SAR",
    total_fishing_hours > 0 & (is.na(total_presence_score) | total_presence_score == 0) ~ "Only AIS",
    (is.na(total_fishing_hours) | total_fishing_hours == 0) & total_presence_score > 0 ~ "Only SAR",
    TRUE ~ "No fishing detected"
  ))

# Create the world map
world_map <- map_data("world")

# Create the plot
world_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data, 
            aes(x = lon_std, y = lat_std, fill = category)) +
  scale_fill_manual(
    values = c("Both AIS and SAR" = "purple", 
               "Only AIS" = "blue", 
               "Only SAR" = "red", 
               "No fishing detected" = "white"),
    name = "Fishing Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Detection",
       subtitle = "Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Display the plot
print(world_plot)

Code
# Get summary statistics
summary_stats <- combined_data %>% 
  count(category) %>% 
  mutate(percentage = n / sum(n) * 100) %>%
  rename(`Number of cells` = n) %>%
  mutate(percentage = round(percentage, 2))

# Create and display the table
kable(summary_stats, 
      format = "html", 
      col.names = c("Category", "Number of cells", "Percentage (%)"),
      caption = "Summary Statistics of Data Categories") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, 
                position = "center") %>%
  column_spec(2, width = "150px") %>%
  column_spec(3, width = "150px")
Summary Statistics of Data Categories

Category           Number of cells   Percentage (%)
Both AIS and SAR           163,095             9.12
Only AIS                 1,566,190            87.60
Only SAR                    58,668             3.28

Random forest model

Predicting fishing hours in areas with only SAR data at a 0.1-degree resolution, using a random forest trained on >160,000 grid cells worldwide where SAR vessel detections and AIS fishing hours co-occur, with geographic coordinates, distance to ports, distance to shore, and bathymetry as additional predictors.

Code
# Open the saved adjusted rasters
Ports_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-port-0.1deg-adjusted.tif"))
Shore_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-shore-0.1deg-adjusted.tif"))
Bathy_adjusted <- raster(here::here("Data", "Environmental data layers", "bathymetry-0.1deg-adjusted.tif"))

# Stack the resampled rasters
raster_stack <- stack(Shore_adjusted, Ports_adjusted, Bathy_adjusted)

# Convert the stack to a dataframe
raster_df <- as.data.frame(raster_stack, xy = TRUE)

# Rename the columns
names(raster_df) <- c("x", "y", "dist_shore", "dist_ports", "bathy")

# Remove NA values if desired
raster_df <- na.omit(raster_df)

# Convert to data.table for efficiency
setDT(raster_df)

# Round x and y to 1 decimal place for consistency
raster_df[, `:=`(
  lon_std = round(x, digits = 1),
  lat_std = round(y, digits = 1)
)]

# Now proceed with the join and model training
load(here::here("data","combined_data_O1deg.Rdata"))
setDT(combined_data_01deg)

# Perform the join using data.table
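# (data.table syntax: X[Y, on = ..., nomatch = 0] is an inner join of X and Y,
# keeping only grid cells present in both tables)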
combined_data_with_rasters <- raster_df[combined_data_01deg, 
                                        on = .(lon_std, lat_std), 
                                        nomatch = 0]

# Convert back to dataframe if needed
combined_data_with_rasters <- as.data.frame(combined_data_with_rasters)

# Prepare the training data
training_data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  na.omit()

# Train the random forest model with timing
set.seed(123)  # for reproducibility
#rf_timing <- system.time({
#  rf_model_env <- randomForest(
#    total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#    data = training_data,
#    ntree = 500,
#    importance = TRUE
#  )
#})

#cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")

# Save the new model
#save(rf_model_env, file = "rf_model_01deg_with_rasters.Rdata")
load(here::here("rf_model_01deg_with_rasters.Rdata"))

Maps of predictions

Code
# Prepare the prediction data
prediction_data <- combined_data_with_rasters %>%
  dplyr::select(total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy)

# Make predictions
predictions <- predict(rf_model_env, newdata = prediction_data)

# Add predictions to the original dataset
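# match() on a paste()-ed "lon lat" key aligns each prediction with its grid cell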
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ predictions[match(paste(lon_std, lat_std), paste(prediction_data$lon_std, prediction_data$lat_std))],
      TRUE ~ 0
    )
  )

# Create the world map
world_map <- map_data("world")

# Map of predicted fishing hours only 
# Prepare the data for the map
map_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)

# Function to create map for a specific region
create_region_map <- function(data, world_map, lon_col, lat_col, lon_range, lat_range, title, subtitle) {
  ggplot() +
    geom_map(data = world_map, map = world_map,
             aes(long, lat, map_id = region),
             color = "darkgray", fill = "lightgray", size = 0.1) +
    geom_tile(data = data, 
              aes(x = .data[[lon_col]], y = .data[[lat_col]], fill = predicted_fishing_hours)) +
    scale_fill_viridis(
      option = "inferno",
      direction = -1,
      trans = "log1p",
      name = "Predicted fishing hours (2017-2020)", 
      breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
      labels = scales::comma,
      guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
    ) +
    coord_fixed(1.3, xlim = lon_range, ylim = lat_range) +
    theme_minimal() +
    labs(title = title,
         subtitle = subtitle,
         x = "Longitude", y = "Latitude") +
    theme(
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "vertical",
      legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
      legend.title = element_text(margin = ggplot2::margin(b = 10))
    )
}

# Global map
predicted_SAR_only_plot <- create_region_map(map_data, world_map, "lon_std", "lat_std", c(-180, 180), c(-90, 90), "Predicted Fishing Hours in Areas with Only SAR Detections", "0.1 degree resolution")

# Caribbean map
caribbean_map <- create_region_map(map_data, world_map, "lon_std", "lat_std", c(-100, -50), c(0, 40), "Predicted Fishing Hours in the Caribbean", "0.1 degree resolution")

# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(map_data, world_map, "lon_std", "lat_std", c(-20, 80), c(0, 70), "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", "0.1 degree resolution")

# Asia map
asia_map <- create_region_map(map_data, world_map, "lon_std", "lat_std", c(60, 180), c(-20, 60), "Predicted Fishing Hours in Asia", "0.1 degree resolution")

# Print the maps
print(predicted_SAR_only_plot)

Code
print(caribbean_map)

Code
print(indian_european_map)

Code
print(asia_map)

Code
#Map of both original and predicted AIS fishing hours 
# Visualize the results
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg, 
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Hours (0.1 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)

Code
#Aggregate data to 0.5 degree resolution 
# First, round the coordinates to the nearest 0.5 degree
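# e.g. round(12.3 * 2) / 2 = 25 / 2 = 12.5, the nearest 0.5-degree step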
combined_data_05deg <- combined_data_01deg %>%
  mutate(
    lon_05deg = round(lon_std * 2) / 2,
    lat_05deg = round(lat_std * 2) / 2
  )

# Aggregate the data
aggregated_data <- combined_data_05deg %>%
  group_by(lon_05deg, lat_05deg) %>%
  summarise(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
    .groups = 'drop'
  )

save(aggregated_data, file=here::here("Predicted_Fishing_Hours_05Deg.Rdata"))

# Create the map
predicted_plot_05deg <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = aggregated_data, 
            aes(x = lon_05deg, y = lat_05deg, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Hours (0.5 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Print the map
print(predicted_plot_05deg)

Code
# Caribbean map
caribbean_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg", c(-100, -50), c(0, 40), "Fishing Hours in the Caribbean", "0.5 degree resolution")

# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg", c(-20, 80), c(0, 70), "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", "0.5 degree resolution")

# Asia map
asia_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg", c(60, 180), c(-20, 60), "Fishing Hours in Asia", "0.5 degree resolution")

# Print the maps
print(caribbean_map)

Code
print(indian_european_map)

Code
print(asia_map)

Model performance

Code
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
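  # Note: predict() on these rows gives in-sample predictions (all trees),
  # whereas model$rsq used below is the out-of-bag estimate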
  if (log_target) {
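    # back-transform, assuming the target was modelled as log10(hours + 1)
    # (not used in this workflow; log_target stays FALSE)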
    predictions <- 10^predictions - 1
  }
  
  actual <- data$total_fishing_hours
  
  # Basic Error Metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)
  
  # R-squared (matching randomForest's % Var explained)
  r_squared <- model$rsq[length(model$rsq)]
  
  # Adjusted R-squared
  n <- length(actual)
  p <- nrow(importance(model)) # Number of predictors (one row per variable)
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
  
  # Residual Analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)
  
  # Feature Importance (for Random Forest)
  feature_importance <- importance(model)
  
  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

validation_data <- combined_data_with_rasters %>%
  mutate(
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )

# Evaluate the model (note: these are the cells the model was trained on, so the error metrics below are in-sample)
validation_data <- validation_data %>% filter(data_category == "Both AIS and SAR")
results_rf_env <- evaluate_model(rf_model_env, validation_data)

# Create a dataframe for the table
results_table <- data.frame(
  Metric = c("Mean Absolute Error", "Root Mean Squared Error", "Mean Absolute Percentage Error",
             "Median Absolute Error", "R-Squared", "Adjusted R-Squared",
             "Mean of Residuals", "Standard Deviation of Residuals"),
  Value = round(c(results_rf_env$`Mean Absolute Error`,
                  results_rf_env$`Root Mean Squared Error`,
                  results_rf_env$`Mean Absolute Percentage Error`,
                  results_rf_env$`Median Absolute Error`,
                  results_rf_env$`R-Squared`,
                  results_rf_env$`Adjusted R-Squared`,
                  results_rf_env$`Mean of Residuals`,
                  results_rf_env$`Standard Deviation of Residuals`),
                2)  # Round to 2 decimal places
)

# Create and display the table
kable(results_table, format = "html", digits = 4, caption = "Model Evaluation Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")
Model Evaluation Metrics

Metric                            Value
Mean Absolute Error              174.91
Root Mean Squared Error          810.11
Mean Absolute Percentage Error  1250.65
Median Absolute Error             22.69
R-Squared                          0.81
Adjusted R-Squared                 0.81
Mean of Residuals                 -3.60
Standard Deviation of Residuals  810.11

The very large MAPE reflects cells with near-zero observed fishing hours, where percentage errors explode; the median absolute error (22.69 hours) is a more robust indication of typical error.
Code
# For feature importance, create a separate table
feature_importance <- as.data.frame(results_rf_env$`Feature Importance`)
feature_importance$Feature <- rownames(feature_importance)
feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]
colnames(feature_importance) <- c("Feature", "%IncMSE", "IncNodePurity")

# Sort the feature importance table by %IncMSE in descending order
feature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]

kable(feature_importance, format = "html", digits = 4, 
      col.names = c("Feature", "%IncMSE", "IncNodePurity"),
      caption = "Feature Importance") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, width = "150px")
Feature Importance

Feature                %IncMSE   IncNodePurity
total_presence_score  127.1595    737080845352
lon_std                80.5654    557956385485
dist_ports             55.9307    312138319465
bathy                  48.4341    238192589722
lat_std                48.0964    540028288659
dist_shore             46.3306    213555877780

Test for spatial autocorrelation

Code
##------------ Create a non-random test set 
# Prepare the data
data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  na.omit()

# Split the data into training and test sets
set.seed(123)  # for reproducibility
train_indices <- sample(1:nrow(data), 0.7 * nrow(data))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]

# Train the random forest model
#set.seed(123)  # for reproducibility
#rf_timing <- system.time({
#  rf_model_env <- randomForest(
#    total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#    data = train_data,
#    ntree = 500,
#    importance = TRUE
#  )
#})

#cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")

# Save the new model
#rf_model_env_train=rf_model_env
#save(rf_model_env_train, file = "rf_model_01deg_with_rasters_train.Rdata")
load(here::here("rf_model_01deg_with_rasters_train.Rdata"))

# Make predictions on the test set
test_data$predictions <- predict(rf_model_env_train, test_data)

# Calculate residuals
test_data$residuals <- test_data$total_fishing_hours - test_data$predictions

# Prepare spatial data for autocorrelation analysis
test_data_sf <- st_as_sf(test_data, coords = c("lon_std", "lat_std"), crs = 4326)

# Create a neighbors list
k <- 8  # You can adjust this number
nb <- knearneigh(st_coordinates(test_data_sf), k = k)
nb <- knn2nb(nb)

# Create a spatial weights matrix
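# style = "W": row-standardised weights, so each point's neighbour weights sum to 1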
lw <- nb2listw(nb, style="W")

# Perform Moran's I test on the test set residuals
moran_test <- moran.test(test_data_sf$residuals, lw)

# Create a data frame with the test results
moran_results <- data.frame(
  Statistic = c("Moran I statistic", "Expectation", "Variance", "Standard deviate", "P-value"),
  Value = c(
    round(moran_test$estimate[1], 2),
    round(moran_test$estimate[2], 2),
    round(moran_test$estimate[3], 2),
    round(moran_test$statistic, 2),
    ifelse(moran_test$p.value < 2.2e-16, "< 2.2e-16", format(moran_test$p.value, scientific = TRUE, digits = 4))
  )
)

# Create and display the table
kable(moran_results, format = "html", col.names = c("Statistic", "Value"),
      caption = "Moran I Test under randomisation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE,
                position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "15em")
Moran I Test under randomisation

Statistic           Value
Moran I statistic   0.25
Expectation         0
Variance            0
Standard deviate    115.35
P-value             < 2.2e-16

The residuals of this random train/test split are significantly positively autocorrelated in space, so a random split overstates out-of-sample performance; this motivates the spatially blocked cross-validation below.

Spatial cross-validation

Code
# Assuming your training_data has lon_std (longitude) and lat_std (latitude) columns
coords <- SpatialPoints(training_data[, c("lon_std", "lat_std")])

# Combine your training data with spatial coordinates
spatial_data <- SpatialPointsDataFrame(coords, data = training_data)

# Split the coordinates into spatial blocks with k-means clustering
# Convert spatial_data to an sf object (it is created above as an sp object)
spatial_data_sf <- st_as_sf(spatial_data)

# Extract coordinates
coords <- st_coordinates(spatial_data_sf)

# Perform k-means clustering
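# (note: clustering on raw lon/lat treats degrees as Euclidean distances, a
# simplification that distorts blocks near the poles and the antimeridian)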
set.seed(123)  # for reproducibility
kmeans_result <- kmeans(coords, centers = 6)

# Add cluster assignments to the sf object
spatial_data_sf$block <- as.factor(kmeans_result$cluster)

# Create the plot
ggplot() +
  geom_sf(data = spatial_data_sf, aes(color = block), size = 1) +
  scale_color_viridis_d(name = "Block") +
  theme_minimal() +
  labs(title = "Spatial Blocks for 6-Fold Cross-Validation",
       subtitle = "Points colored by block assignment") +
  theme(legend.position = "bottom")
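
As an alternative to k-means, the blockCV package (loaded above but otherwise unused here) can build spatial folds from regular blocks, with block size ideally informed by the residual autocorrelation range. A minimal sketch, assuming blockCV >= 3.x and its cv_spatial() interface; the 500 km block size is illustrative, not tuned:

Code
# Hypothetical alternative: spatial folds via blockCV instead of k-means
# (assumes blockCV >= 3.x; cv_spatial() tiles the points with spatial blocks
# and assigns the blocks to k folds)
#scv <- blockCV::cv_spatial(
#  x = spatial_data_sf,   # sf points built above
#  k = 6,                 # number of folds, matching the k-means setup
#  size = 500000,         # block size in metres (illustrative)
#  selection = "random"   # blocks allocated to folds at random
#)

# Fold assignment per point, comparable to kmeans_result$cluster
#spatial_data_sf$block <- as.factor(scv$folds_ids)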

Code
# Register cores for parallel processing (use 1 less than your total cores)
#cl <- makeCluster(detectCores() - 1)
#registerDoParallel(cl)

# Initialize a list to store results
#results <- foreach(i = 1:6, .packages = c("randomForest", "dplyr"), .export = "spatial_data_sf") %dopar% {
  
  # Get the indices of the training and test sets for the i-th block
#  test_indices <- spatial_data_sf$block == i
#  train_data <- spatial_data_sf[!test_indices, ]
#  test_data <- spatial_data_sf[test_indices, ]
  
  # Train Random Forest on the training data
#  rf_model <- randomForest(
#    total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#    data = train_data,
#    ntree = 500,
#    importance = TRUE
#  )
  
  # Save the model
#  model_filename <- paste0("rf_model_fold_", i, ".rds")
#  saveRDS(rf_model, file = model_filename)
  
  # Predict on the test data
#  predictions <- predict(rf_model, test_data)
  
  # Return a list with results and model filename
#  list(
#    results = data.frame(observed = test_data$total_fishing_hours, predicted = predictions, fold = i),
#    model_filename = model_filename
#  )
#}

# Stop the cluster once done
#stopCluster(cl)

# Extract results and model filenames
#results_df <- do.call(rbind, lapply(results, function(x) x$results))
#model_filenames <- sapply(results, function(x) x$model_filename)

# Save results
#save(results_df, file = "spatial_cv_results.RData")
#save(model_filenames, file = "model_filenames.RData")

#load(here::here("spatial_cv_results.RData"))
#load(here::here("model_filenames.RData"))

# Ensure spatial_data and results_df are aligned
#spatial_data <- spatial_data[order(row.names(spatial_data)), ]
#results_df <- results_df[order(row.names(results_df)), ]

# Get all coordinates
#all_coordinates <- coordinates(spatial_data)

# Create neighbor list for the entire dataset
#k <- 30  # Number of nearest neighbors, adjust as needed
#nb_all <- knn2nb(knearneigh(all_coordinates, k = k))

# Create spatial weights for the entire dataset
#lw_all <- nb2listw(nb_all, style="W", zero.policy = TRUE)

# Function to calculate Moran's I
#calculate_morans_i <- function(residuals, indices, lw_all) {
  # Create a logical vector for subsetting
#  subset_vector <- rep(FALSE, length(lw_all$neighbours))
#  subset_vector[indices] <- TRUE
  
  # Subset the weights list for the current fold
#  lw_subset <- subset.listw(lw_all, subset_vector, zero.policy = TRUE)
  
  # Calculate Moran's I
#  moran_result <- moran.test(residuals, lw_subset, zero.policy = TRUE)
#  return(moran_result)
#}

# Calculate Moran's I for each fold in the spatial cross-validation
#morans_i_results <- list()

#for (i in 1:6) {
  # Get the indices for this fold
#  fold_indices <- which(results_df$fold == i)
  
  # Calculate residuals
#  fold_residuals <- results_df$observed[fold_indices] - results_df$predicted[fold_indices]
  
  # Calculate Moran's I for this fold
#  morans_i_results[[i]] <- calculate_morans_i(fold_residuals, fold_indices, lw_all)
  
  # Print progress
#  print(paste("Completed fold", i))
#}

#save(morans_i_results, file="morans_i_results.Rdata")
load(here::here("morans_i_results.Rdata"))

# Print results
#print("Moran's I for each fold in spatial cross-validation:")
#for (i in 1:6) {
#  print(paste("Fold", i))
#  print(morans_i_results[[i]])
#}

# Extract relevant information from Moran's I results
morans_table <- data.frame(
  Fold = 1:6,
  Moran_I = sapply(morans_i_results, function(x) x$estimate[1]),
  Expectation = sapply(morans_i_results, function(x) x$estimate[2]),
  Variance = sapply(morans_i_results, function(x) x$estimate[3]),
  Std_Deviate = sapply(morans_i_results, function(x) x$statistic),
  P_Value = sapply(morans_i_results, function(x) x$p.value)
)

# Format the table
morans_table_formatted <- morans_table %>%
  mutate(
    Moran_I = round(Moran_I, 4),
    Expectation = format(Expectation, scientific = TRUE, digits = 4),
    Variance = format(Variance, scientific = TRUE, digits = 4),
    Std_Deviate = round(Std_Deviate, 2),
    P_Value = ifelse(P_Value < 2.2e-16, "< 2.2e-16", 
                     format(P_Value, scientific = TRUE, digits = 4))
  )

# Create and print the table using kable and kable_styling
kable(morans_table_formatted,
                            col.names = c("Fold", "Moran's I", "Expectation", "Variance", "Std. Deviate", "p-value"),
                            caption = "Moran's I Results for Each Fold in Spatial Cross-Validation",
                            align = c('c', 'r', 'r', 'r', 'r', 'r'),
                            format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:6, width = "150px")
Moran's I Results for Each Fold in Spatial Cross-Validation

Fold   Moran's I   Expectation   Variance    Std. Deviate   p-value
1      0.4423      -7.330e-05    4.410e-06   210.64         < 2.2e-16
2      0.4504      -1.275e-05    8.039e-07   502.39         < 2.2e-16
3      0.3825      -7.548e-05    4.594e-06   178.51         < 2.2e-16
4      0.6267      -3.090e-05    1.907e-06   453.87         < 2.2e-16
5      0.3767      -9.321e-05    5.730e-06   157.42         < 2.2e-16
6      0.4118      -6.942e-05    4.234e-06   200.15         < 2.2e-16

Spatially blocked predictions

Code
# Load necessary libraries
library(raster)
library(ggplot2)
library(viridis)
library(dplyr)
library(data.table)

# Load the model filenames
load(here::here("model_filenames.RData"))

# Function to make predictions using all 6 models
make_predictions <- function(data) {
  predictions <- matrix(0, nrow = nrow(data), ncol = 6)
  
  for (i in 1:6) {
    model <- readRDS(here::here(model_filenames[i]))
    predictions[, i] <- predict(model, data)
  }
  
  # Take the average prediction across all models
  rowMeans(predictions)
}
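# Averaging across the six fold-specific models gives a simple ensemble, so no
# single spatial block dominates the final prediction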

# Prepare the prediction data (same as before)
prediction_data <- combined_data_with_rasters %>%
  dplyr::select(total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy)

# Make predictions using the spatially blocked models
new_predictions <- make_predictions(prediction_data)

# Add predictions to the original dataset
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ new_predictions[match(paste(lon_std, lat_std), paste(prediction_data$lon_std, prediction_data$lat_std))],
      TRUE ~ 0
    )
  )

# Now you can use the same mapping code as before, but it will use the new predictions

# Map of predicted fishing hours only 
map_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)

# Use your create_region_map function to create the maps
# Global map
predicted_SAR_only_plot <- create_region_map(map_data, world_map, "lon_std", "lat_std", c(-180, 180), c(-90, 90), "Predicted Fishing Hours in Areas with Only SAR Detections", "0.1 degree resolution (Spatially Blocked Model)")

# Caribbean map
caribbean_map <- create_region_map(map_data, world_map, "lon_std", "lat_std", c(-100, -50), c(0, 40), "Predicted Fishing Hours in the Caribbean", "0.1 degree resolution (Spatially Blocked Model)")

# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(map_data, world_map, "lon_std", "lat_std", c(-20, 80), c(0, 70), "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", "0.1 degree resolution (Spatially Blocked Model)")

# Asia map
asia_map <- create_region_map(map_data, world_map, "lon_std", "lat_std", c(60, 180), c(-20, 60), "Predicted Fishing Hours in Asia", "0.1 degree resolution (Spatially Blocked Model)")

# Print the maps
print(predicted_SAR_only_plot)

Code
print(caribbean_map)

Code
print(indian_european_map)

Code
print(asia_map)

Code
# Create the global map of both original and predicted fishing hours
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg, 
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Hours (0.1 degree resolution)",
       subtitle = "Based on AIS data and Spatially Blocked Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)

Code
# Aggregate data to 0.5 degree resolution
combined_data_05deg <- combined_data_01deg %>%
  mutate(
    lon_05deg = round(lon_std * 2) / 2,
    lat_05deg = round(lat_std * 2) / 2
  ) %>%
  group_by(lon_05deg, lat_05deg) %>%
  summarize(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE)
  ) %>%
  ungroup()

# Global map (0.5 degree)
global_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg", 
                                      c(-180, 180), c(-90, 90), 
                                      "Global Fishing Hours", 
                                      "0.5 degree resolution (Observed and Predicted)")

# Caribbean map (0.5 degree)
caribbean_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg", 
                                         c(-100, -50), c(0, 40), 
                                         "Fishing Hours in the Caribbean", 
                                         "0.5 degree resolution (Observed and Predicted)")

# Northwestern Indian Ocean to Western European waters map (0.5 degree)
indian_european_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg", 
                                               c(-20, 80), c(0, 70), 
                                               "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", 
                                               "0.5 degree resolution (Observed and Predicted)")

# Asia map (0.5 degree)
asia_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg", 
                                    c(60, 180), c(-20, 60), 
                                    "Fishing Hours in Asia", 
                                    "0.5 degree resolution (Observed and Predicted)")

# Print the maps
print(global_map_05deg)

Code
print(caribbean_map_05deg)

Code
print(indian_european_map_05deg)

Code
print(asia_map_05deg)